/******************************************************************************
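Description:
*	This program converts the tiebreaker into a lottery number and combines it
*	with the application data to produce a reshaped (student x choice) match file.
*	It then applies the DIA seat set-asides (2017 and 2018), writes the choice,
*	capacity and priority files, and uses them to perform the DA match.
*	No ed opt buckets are defined here: global_priority is set equal to
*	matchprioritygroup.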
*-----------------------------------------------------------------------------*/
* ---- session setup -----------------------------------------------------------
clear all
set more off
* fixed seed so the runiform() draws further down are reproducible
set seed 1234

* ---- run switches and year range ---------------------------------------------
* build_data : 1 = run the build step
* run_da     : 1 = run the DA step
* fy, ly     : first and last year looped over
local build_data 1
local run_da     1
local fy         2016
local ly         2018

* ---- variable lists ------------------------------------------------------------
* student-level variables carried through the build (assembled piece by piece)
local student_var futureeszone futuremszone futuremszonedistrict resdistrict currentgrade sped currentschooldbn
local student_var `student_var' elaalttestname elaalttestreadscore elaalttestwritingscore elaproficiencyrating
local student_var `student_var' mathalttestname mathalttestscore mathproficiencyrating nyseslattestdate
local student_var `student_var' accessiblesiterequired readingcategory dayspresent absent late math ela ss sc
local student_var `student_var' timemanagementindependence organization perseverance asksforhelp
local student_var `student_var' respectsschoolrulescollaboration talent1 talent1score talent2 talent2score iep
local student_var `student_var' roundmatched tiebreaker zonedschooldbn eszoneddbn daysabsent
local student_var `student_var' dayslate optout ethnicity sex birth_mm_yyyy ell swd poverty
local student_var `student_var' black hispanic white asian other current_district

* final-offer and main-round offer variables
local offers_var finaloffer finalofferdbn manuallyplacedapplicationchoice finalprogramcode finalprogramname finalschooldbn finalschoolname
local mr_offers_var mrmatch mrmatchdbn mrmp mrmpdbn

* per-choice variables (stored wide with a numeric suffix in the raw file),
* followed by the same names split by the storage type they are created with
local choice_var programname programcode programtype schooldbn choice rank compositescore matchprioritygroup eligible testoutcome matcheligibleyn
local choice_var_num matchprioritygroup rank compositescore testoutcome
local choice_var_string programcode programname schooldbn programtype eligible matcheligibleyn

*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

* ------------------------------------------------------------------------------
* BUILD STEP (runs when build_data == 1)
* For every year from fy to ly: load the application file, reshape the wide
* per-choice columns to one row per student x choice, attach talent-test
* outcomes from the program crosswalk, keep current 5th graders, build a
* lottery number scaled by the largest rank, and save one file per year.
* ------------------------------------------------------------------------------
if `build_data' == 1 {

forvalues year = `fy'/`ly' {
	use "${cleandata}application_all_bio.dta", clear

	keep if year ==`year'

	* keep only the variables that relate to the main round
	drop ns*
	drop appeal*

	* shorten the talent-test names
	* (capture suppresses any error, e.g. if a talent variable is absent)
	forvalues t=1/2 {
		cap replace talent`t' = "Art" if talent`t' =="Art Talent Test"
		cap replace talent`t' = "Athletics" if talent`t' =="Athletics Talent Test"
		cap replace talent`t' = "Computer/Math" if talent`t' =="Computer/Math Talent Test"
		cap replace talent`t' = "Dance" if talent`t' =="Dance Talent Test"
		cap replace talent`t' = "Drama" if talent`t' =="Drama Talent Test"
		cap replace talent`t' = "CW/Journalism" if talent`t' =="Journalism Talent Test"
		cap replace talent`t' = "Media" if talent`t' =="Media Talent Test"
		cap replace talent`t' = "Science" if talent`t' =="Science Talent Test"
		cap replace talent`t' = "Strings" if talent`t' =="String Instrument Talent Test"
		cap replace talent`t' = "Winds" if talent`t' =="Wind Instrument Talent Test"
		cap replace talent`t' = "Vocal" if talent`t' =="Vocal Talent Test"
		cap replace talent`t' = "Vocal" if talent`t' =="Vocal music"
		cap replace talent`t' = "Winds" if talent`t' =="Instrumental-winds"
		cap replace talent`t' = "Strings" if talent`t' =="Instrumental-strings"
	}

	* priority group columns: blank out the "-" placeholder, then make numeric
	forval c=1/70 {
		cap replace matchprioritygroup`c' = "" if matchprioritygroup`c' == "-"
		cap destring matchprioritygroup`c', replace
	}

	* reshape the file into long: one copy of each student row per choice slot
	* (59 slots in 2016, 12 afterwards; any other year is left unexpanded)
	local nchoices 0
	if `year' == 2016 local nchoices 59
	if `year' >  2016 local nchoices 12
	if `nchoices' > 0 expand `nchoices'

	drop choice*
	bysort stu: gen choice=_n

	* empty long-format columns, numeric or string as listed in the settings
	foreach var in `choice_var_num' {
		gen `var'=.
	}
	foreach var in `choice_var_string' {
		gen `var'=""
	}

	* copy slot c of each wide variable into the row with choice == c
	* (capture suppresses errors, e.g. for wide columns that do not exist)
	forvalues c = 1/`nchoices' {
		foreach var in `choice_var' {
			cap replace `var'=`var'`c' if choice==`c'
		}
	}

	* drop empty observations
	keep if programcode!=""

	*keep useful variables
	keep stu choice `student_var' `mr_offers_var' `choice_var' `offers_var'
	* NOTE(review): stu must already exist for the keep above to succeed, so
	* this rename appears to have no effect at this position - confirm
	cap rename student_id_scram stu

	* attach the talent tested by each program (talent is presumably supplied
	* by the crosswalk - confirm) and take the matching talent score
	merge m:1 programcode using "${cleandata}talent_program_crosswalk.dta"
	drop if _merge==2
	replace testoutcome= talent1score if talent1==talent
	replace testoutcome= talent2score if talent2==talent
	replace programtype = "False Test Outcome" if programtype=="Test Outcome" & _merge !=3

	*some checks
	count if testoutcome==. & _merge ==3
	tab compositescore if _merge ==3, mis
	tab rank if _merge ==3, mis
	tab currentgrade, mis

	drop _merge

	*keep only 5th grade
	keep if currentgrade==5

	*destring poverty
	destring poverty, replace

	* Convert tiebreaker string to lottery number (one for each kid)
	sort tiebreaker
	egen lottery_rank = group(tiebreaker)

	* Rescale lottery rank by its maximum, so the largest rank becomes 1
	su lottery_rank
	gen double lottery_rank_mod_new = lottery_rank / r(max)
	drop lottery_rank
	ren lottery_rank_mod_new lottery_rank
	gen lottery_rank_missing = lottery_rank == .

	* Draw random number if missing lottery rank.
	* The data are one row per student x choice, so the draw on the first row
	* of each student is copied to all of that student's rows; otherwise the
	* same student would carry a different lottery number on every application.
	isid stu choice
	so stu choice
	gen double shuffle = runiform(0,1) if lottery_rank== . // Make sure it is double
	by stu (choice): replace shuffle = shuffle[1]
	replace lottery_rank = shuffle if lottery_rank == .

	* every student must carry exactly one lottery number
	by stu (choice): assert lottery_rank == lottery_rank[1]

	* Assert that there aren't applicants who have the same lottery number
	* (NYC vs. our draw); checked on one tagged row per student
	egen tag = tag(stu)
	duplicates tag lottery_rank if tag, gen(lottery_dups)
	su lottery_dups
	assert r(max) == 0
	drop tag

	*save the final dataset
	* NOTE(review): the DA step further down reads nyc_match_reshape<year>_ms.dta
	* (no _copy suffix) - confirm which file name is intended
	save "${cleandata}nyc_match_reshape`year'_ms_copy.dta", replace
}
}

********************************************************************************
********************************** RUN DA **************************************
********************************************************************************

local group nonsped sped

if `run_da' == 1 {

foreach g in `group'{

forvalues year = `fy'/`ly' {

	 use "${cleandata}nyc_match_reshape`year'_ms.dta", clear

     * Restrict to the requested sample (g is nonsped or sped).
     * The sped values that identify the sped sample depend on the year.
	 local sped_cond `"inlist(sped,"ICT","PART-TIME","SC")"'
	 if `year' == 2017 local sped_cond `"inlist(sped,"SE")"'
	 if `year' == 2019 local sped_cond `"inlist(sped,"SWD")"'

	 * nonsped drops the flagged rows, any other group keeps only them
	 if "`g'" == "nonsped" drop if `sped_cond'
	 else keep if `sped_cond'

	* drop if ineligible for choice (var only available in 2018)
	if `year'==2018 keep if matcheligibleyn=="Y"


********************************************************************************
**************************** Estimate DIA policies *****************************

	* 2017 DIA policy: every application to a DIA school is split into a _DIA
	* (reserved) and an _OPEN copy of the program. The reserved copy is ranked
	* first by the student, and non-DIA students are pushed behind DIA students
	* in the priority order for reserved seats.
	if `year' == 2017{

		gen pct_dia=0

		*identify DIA programs
		gen prog_dia=0
		replace prog_dia=1 if inlist(schooldbn, "01M450", "02M114", "15K447", "15K839")

		*identify DIA students (poverty status, on applications to a DIA school)
		gen stud_dia=0
		replace stud_dia=1 if poverty==1 & prog_dia==1

		*specify %DIA seats
		replace pct_dia=0.62 if schooldbn=="01M450"
		replace pct_dia=0.10 if schooldbn=="02M114"
		replace pct_dia=0.30 if schooldbn=="15K447"
		replace pct_dia=0.40 if schooldbn=="15K839"

		*modify applications file to account for DIA policies

		*separate programs into DIA and open seats
		* exp tags the copy created by expand (0 = original row, 1 = copy), so
		* each application row gets exactly one _DIA and one _OPEN version even
		* when a student ranks more than one program at the same school. This is
		* the same tagging approach the 2018 block uses.
		expand 2 if prog_dia==1, gen(exp)
		sort stu schooldbn programcode exp
		gen new_programcode=programcode+"_DIA" if prog_dia==1 & exp==0
		replace new_programcode=programcode+"_OPEN" if prog_dia==1 & exp==1
		drop exp

		*PREFERENCES: reserve seats are filled before open seats
		replace choice=choice+0.5 if substr(new_programcode, -4, 4)=="OPEN"

		*CAPACITIES: specify pct of total seats in a program allocated to DIA vs open seats
		replace pct_dia=1-pct_dia if substr(new_programcode, -4, 4)=="OPEN"

		*PRIORITIES: applicants receive priority for DIA seats if they are FL/FRL/Feeder (specific to each program, FL/FRL approximated by poverty status)
		* max_priority is the largest priority group observed at each program
		bysort programcode: egen max_priority=max(matchprioritygroup)
		* shift back priorities for non-DIA students in DIA seats
		replace matchprioritygroup=matchprioritygroup+max_priority if prog_dia==1 & substr(new_programcode, -3, 3)=="DIA" & stud_dia==0

		*rename program codes
		replace programcode=new_programcode if prog_dia==1
		drop new_programcode
	}

	* 2018 DIA policy: same idea as 2017 (split each DIA program into reserved
	* and open copies), with more schools. School 07X343 has two reserved tiers
	* (_DIA and _DIA2), so its applications are split three ways.
	if `year'==2018{

		* pct_dia holds the seat share of each program copy; pct_dia2 is the
		* share of the second reserved tier (nonzero only for 07X343)
		gen pct_dia=0
		gen pct_dia2=0

        *specify parameters of DIA programs in each year

		*identify DIA programs
		gen prog_dia=0
		replace prog_dia=1 if inlist(schooldbn,"01M450", "02M114", "15K447", "15K839", "02M260", "07X343", "15K497")

		*identify DIA students
		* stud_dia==1: poverty status at the listed schools, or coming from the
		* first list of current schools when applying to 07X343
		* stud_dia==2: coming from the second list of current schools when
		* applying to 07X343 (this overrides a 1 set by the line before)
		gen stud_dia=0
		replace stud_dia=1 if poverty==1 & inlist(schooldbn, "01M450", "02M114", "15K447", "15K839", "02M260", "15K497")
		replace stud_dia=1 if  inlist(currentschooldbn, "07X001","07X049" ,"07X154" ,"07X277" ,"07X359" ,"07X369") & schooldbn=="07X343"
		replace stud_dia=2 if inlist(currentschooldbn, "07X005","07X018" ,"07X025" ,"07X029" ,"07X031" ,"07X157" ,"07X161") & schooldbn=="07X343"

		*specify %DIA seats
		replace pct_dia=0.62 if schooldbn=="01M450"
		replace pct_dia=0.10 if schooldbn=="02M114"
		replace pct_dia=0.30 if schooldbn=="15K447"
		replace pct_dia=0.40 if schooldbn=="15K839"
		replace pct_dia=0.17 if schooldbn=="02M260"
		replace pct_dia=0.25 if schooldbn=="07X343"
		replace pct_dia2=0.15 if schooldbn=="07X343"
		replace pct_dia=0.40 if schooldbn=="15K497"

	    *modify applications file to account for DIA policies

		*separate programs into DIA and open seats
		* exp  tags the copy made for two-way splits (all DIA schools but 07X343)
		* exp2 tags the two copies made for the three-way split at 07X343
		expand 2 if prog_dia==1 & schooldbn!="07X343", gen(exp)
		gen aux =3 if schooldbn=="07X343" & prog_dia==1
		expand aux if prog_dia==1 & schooldbn=="07X343", gen(exp2)
		drop aux
		* the _n tests below rely on this order: within stu schooldbn programcode
		* the original row (exp2==0) comes first, then the two exp2==1 copies
		sort stu schooldbn programcode exp exp2
		* original rows (including the 07X343 originals) become the _DIA copy
		bysort stu schooldbn programcode: gen new_programcode=programcode+"_DIA" if exp==0 & exp2==0 & prog_dia==1
		* two-way splits: the expanded copy becomes _OPEN
		bysort stu schooldbn programcode: replace new_programcode=programcode+"_OPEN" if exp==1 & exp2==0 & prog_dia==1
		* three-way split: second row becomes _DIA2, third row becomes _OPEN
		bysort stu schooldbn programcode: replace new_programcode=programcode+"_DIA2" if _n==2 & prog_dia==1 & exp2==1
		bysort stu schooldbn programcode: replace new_programcode=programcode+"_OPEN" if _n==3 & prog_dia==1 & exp2==1
		drop exp exp2

		*PREFERENCES: reserve seats are filled before open seats, dia seats before dia2 seats
		* resulting order within an original choice: _DIA (+0), _DIA2 (+0.25), _OPEN (+0.5)
		replace choice=choice+0.5 if substr(new_programcode, -4, 4)=="OPEN"
		replace choice=choice+0.25 if substr(new_programcode, -4, 4)=="DIA2"

		*CAPACITIES: specify pct of total seats in a program allocated to DIA vs open seats
		* open copy gets whatever is left after both reserved tiers; the _DIA2
		* copy gets pct_dia2; the _DIA copy keeps pct_dia as assigned above
		replace pct_dia=1-(pct_dia + pct_dia2) if substr(new_programcode, -4, 4)=="OPEN"
		replace pct_dia= pct_dia2 if substr(new_programcode, -4, 4)=="DIA2"
		drop pct_dia2

		*PRIORITIES: applicants receive priority for DIA seats if they are FL/FRL/Feeder (specific to each program, FL/FRL approximated by poverty status)
		* max_priority is computed on the original program codes, since
		* programcode is only overwritten at the end of this block
		bysort programcode: egen max_priority=max(matchprioritygroup) /*figure out total number of priority groups at each program*/
		* a code ending in DIA2 ends in "IA2", so the first line touches only
		* _DIA copies and the second only _DIA2 copies
		replace matchprioritygroup=matchprioritygroup+max_priority if prog_dia==1 & substr(new_programcode, -3, 3)=="DIA" & stud_dia!=1 /*shift back priorities for non-DIA students in DIA seats*/
		replace matchprioritygroup=matchprioritygroup+max_priority if prog_dia==1 & substr(new_programcode, -4, 4)=="DIA2" & stud_dia!=2 /*shift back priorities for non-DIA students in DIA seats*/

		*rename program codes
		replace programcode=new_programcode if prog_dia==1
		drop new_programcode

	}

	* 2018-only manual patch for program K235M. Statement order matters:
	* priority 1 is first moved to 2, then applicants whose futuremszone
	* starts with "18" are set to priority 1.
	if `year'==2018{
		* K235M: district priority missing + not sure rank actually matters
		replace matchprioritygroup =2 if matchprioritygroup==1 & programcode=="K235M"
		replace matchprioritygroup =1 if substr(futuremszone, 1, 2) =="18" & programcode=="K235M"
		* all non-missing ranks are collapsed to 1
		replace rank =1 if rank !=. &  programcode=="K235M"
	}

    *otherwise, assume that missing values for priority, pclrank, or compositescore means that the student is last in line
	*(e.g. for programs where pclrank is either 1 or missing, then missing is essentially zero. These are likely limited unscreened programs, per guidance from NYCDOE on 6/5/19 call)
	* a very large priority number sorts these applicants behind everyone else
	replace matchprioritygroup=100000000 if matchprioritygroup==.

	* Save data modified for DIA policies (one file per year and group).
	* The path is written without an extra separator after the cleandata
	* global, matching the use statement that reloads this file after the
	* match and every other path in this do-file.
	if `year' >= 2017 {
		save "${cleandata}nyc_match_reshape`year'_ms_dia_`g'.dta", replace
	}

	* Code ranks for limited unscreened programs into priorities:
	* applicants without a rank get their priority group multiplied by 10,
	* then rank is blanked for all limited unscreened applications.
	* This runs after the save, so the saved file holds the unrecoded values.
	replace matchprioritygroup = matchprioritygroup * 10 if programtype=="Limited Unscreened" & mi(rank)
	replace rank = . if programtype=="Limited Unscreened"

********************************************************************************
***************************** Create DA inputs ********************************

	    * ---- choice file: stu and programcode, rows sorted by stu and choice ----
		preserve
			format stu programcode %20s
			order stu choice
			sort stu choice
			outsheet stu programcode using "${cleandata}da/choice_file_R1_current_forpscore_ms_`year'_`g'.txt", replace delimiter(" ") nonames
		restore

		* ---- capacities file: one line per program with its number of seats ----
		* capacity = number of rows whose main-round match (mrmatch) equals the
		* program code or is a leading substring of it; after 2016 this is scaled
		* by pct_dia wherever pct_dia is positive
		preserve
			gen offer = (programcode==mrmatch) | (mrmatch!="" & substr(programcode, 1, length(mrmatch))==mrmatch)
			bysort programcode: egen offers=total(offer)
			if `year'>2016 replace offers=round(offers*pct_dia) if pct_dia>0

			keep programcode offers
			order programcode offers
			duplicates drop

			* the same table is written under three file names; only the last
			* one carries the group suffix
			outsheet using "${cleandata}da/`year'_capacity_ms.txt", replace delimiter(" ") nonames
			outsheet using "${cleandata}da/`year'_capacity_nsped_ms.txt", replace delimiter(" ") nonames
			rename offers capacity
			outsheet using "${cleandata}da/capacity_R1_current_forpscore_ms_`year'_`g'.txt", replace delimiter(" ") nonames
		restore

		* ---- priorities file: applicants listed program by program ----
		* sort keys within program: priority group, rank, composite score
		* (descending), test outcome (descending), lottery number, student id
		preserve
			gsort programcode matchprioritygroup rank -compositescore -testoutcome lottery_rank stu
			keep  programcode stu
			order programcode stu
			outsheet using "${cleandata}da/`year'_priorities_ms.txt", replace delimiter(" ") nonames
			outsheet programcode stu using "${cleandata}da/priority_file_R1_current_forpscore_ms_`year'_`g'.txt", replace delimiter(" ") nonames
		restore

		* ---- student id file: stu next to a numeric copy (rsid) ----
		preserve
			keep stu
			gen rsid = stu
			destring rsid, replace
			format rsid %20.0g
			order stu rsid
			outsheet using "${cleandata}da/rsid_file_R1_current_forpscore_ms_`year'_`g'.txt", replace delimiter(" ") nonames
		restore

		* store the current year in the global macro year
		* (not read anywhere in the visible part of this do-file)
		global year =`year'

		* Remove any match file left over from an earlier run. shell does not
		* stop the do-file when the perl call fails, so a stale file would
		* otherwise be read in silently as if it were the new result.
		capture erase "${cleandata}da/match_R1_current_forpscore_ms_`year'_`g'.txt"

		sleep 1000

		// *** RUN DA ***
		* arguments: capacity file, choice file, priority file, output match file
		* NOTE(review): the backslash in the script path assumes Windows - confirm
		shell perl "$ado\daaNYC_sims.pl" ///
			"${cleandata}da/capacity_R1_current_forpscore_ms_`year'_`g'.txt" ///
			"${cleandata}da/choice_file_R1_current_forpscore_ms_`year'_`g'.txt" ///
			"${cleandata}da/priority_file_R1_current_forpscore_ms_`year'_`g'.txt" ///
			"${cleandata}da/match_R1_current_forpscore_ms_`year'_`g'.txt"

		sleep 1000

		* stop here with a clear error if the DA script produced no output
		confirm file "${cleandata}da/match_R1_current_forpscore_ms_`year'_`g'.txt"

		* Read in DA assignment output file and keep it in a tempfile
		* (the local macro match) for the merge further down
		preserve
			insheet using "${cleandata}da/match_R1_current_forpscore_ms_`year'_`g'.txt", clear names delimiter(" ")
			ren student_id stu
			tempfile match
			sa "`match'"
		restore

	****************************************************************************
	*Clean post-match
	****************************************************************************

		* Reload the application-level data for post-match processing.
		* From 2017 on this is the DIA-adjusted file saved before the match;
		* earlier years use the reshaped file with the missing-priority fill.
		if `year' >= 2017 {
			use "${cleandata}nyc_match_reshape`year'_ms_dia_`g'", clear
		}
		else {
			use "${cleandata}nyc_match_reshape`year'_ms", clear
			* missing priority group is treated as last in line
			* (per guidance from NYCDOE on 6/5/19 call, see the pre-match step)
			replace matchprioritygroup=100000000 if matchprioritygroup==.
		}

		* Restrict to the requested sample (g is nonsped or sped);
		* the sped values that identify the sped sample depend on the year
		local sped_cond `"inlist(sped,"ICT","PART-TIME","SC")"'
		if `year' == 2017 local sped_cond `"inlist(sped,"SE")"'
		if `year' == 2019 local sped_cond `"inlist(sped,"SWD")"'
		if "`g'" == "nonsped" drop if `sped_cond'
		else keep if `sped_cond'

		* Code ranks for limited unscreened programs into priorities
		* (same recode as applied before the DA inputs were written)
		replace matchprioritygroup = matchprioritygroup * 10 if programtype=="Limited Unscreened" & mi(rank)
		replace rank = . if programtype=="Limited Unscreened"

	* ----------------------------------------------------------------------------
	*  Offer variables, global priority, and variable selection
	* ----------------------------------------------------------------------------
		* Actual offer indicator (level = student X program): the main-round
		* match equals the program code or is a leading substring of it
		gen offer = (programcode==mrmatch) | (mrmatch!="" & substr(programcode, 1, length(mrmatch))==mrmatch)

		* ever_offer (level = student): any actual offer across the applications
		bys stu : egen ever_offer = max(offer)

		* higher_offer (level = student X program): 1 if an actual offer sits
		* on an earlier choice of the same student (running total of previous
		* offers is positive), 0 otherwise
		sort stu choice
		by stu : gen higher_offer = (sum(offer) - offer) > 0

		* total_offers (level = program), scaled by the seat share after 2016
		bys programcode: egen total_offers = total(offer)
		if `year'>2016 replace total_offers=round(total_offers*pct_dia) if pct_dia>0

		* Global priority: unlike high schools, there are no Ed. Opt programs,
		* so no separate construction is needed and the global priority is
		* simply the original matchprioritygroup
		gen global_priority = matchprioritygroup

		* keep relevant vars; the dia variables exist from 2017 on only
		* NOTE(review): ever_offer, higher_offer and total_offers are not matched
		* by this list and are dropped here - confirm that is intended
		local dia_keep ""
		if `year'>=2017 local dia_keep "*dia*"
		keep stu `dia_keep' choice *priority compositescore* testoutcome* lottery_rank* rank offer *match* `student_var' `offers_var' mrmatch mrmatchdbn schooldbn program*

		* ----------------------------------------------------------------------------
		*  Merge in the simulated match and flag agreement with the actual offer
		* ----------------------------------------------------------------------------

		* stu becomes numeric for the merge with the DA output (tempfile match)
		destring stu, replace
		merge m:1 stu using "`match'", gen(has_our_match)

		* In the perl script, if an applicant is not matched, they are assigned
		* to their ID. Blank those out here.
		tostring stu, gen(stu_as_text)
		replace ourmatch = "" if ourmatch == stu_as_text
		drop stu_as_text

		* stripped_match: simulated match without the DIA seat tags
		* (e.g. 07X343_OPEN --> 07X343); _DIA2 is removed before _DIA so the
		* longer tag is not cut short
		gen stripped_match = subinstr(subinstr(subinstr(ourmatch,"_OPEN","",.),"_DIA2","",.),"_DIA","",.)

		* same: actual assignment equals our DA assignment at the program level,
		* regardless of which seat copy was assigned
		gen same = mrmatch == stripped_match

		* my_offer: this row is the program our DA assigned the student to,
		* compared on tag-free program codes
		gen stripped_programcode = subinstr(subinstr(subinstr(programcode,"_OPEN","",.),"_DIA2","",.),"_DIA","",.)
		gen my_offer = stripped_programcode == stripped_match

		* Simulated offers: one row per distinct stu, ourmatch, stripped_match
		preserve
			keep stu ourmatch stripped_match
			duplicates drop
			save "${cleandata}best_guess_simulated_match_ms_`year'_`g'.dta", replace
		restore

		* Program type of the program offered *in our match*:
		* lookup table keyed on stripped_match, built from the my_offer rows
		preserve
			keep if my_offer
			keep stripped_match programtype
			duplicates drop
			ren programtype myoffered_program_type
			tempfile sim_progs
			save "`sim_progs'"
		restore

		merge m:1 stripped_match using "`sim_progs'", gen(prog_match)

		* Rank and choice number that apply to the program we matched a student to
		preserve
			keep if my_offer
			keep stu rank choice
			duplicates drop
			* A DIA program appears once per seat copy; keep the row with the
			* largest choice value per student.
			* NOTE(review): the earlier comment said the DIA version is kept, but
			* the _OPEN copy carries the +0.5 choice offset, so the largest
			* choice looks like the _OPEN copy - confirm which is intended
			bys stu: egen top_choice = max(choice)
			keep if top_choice == choice
			drop top_choice
			ren rank rank_matched
			ren choice choice_matched
			tempfile matched_rank
			save "`matched_rank'"
		restore

		merge m:1 stu using "`matched_rank'", nogen

		* Program type of the program the student was _actually_ offered:
		* lookup table keyed on mrmatch, built from the rows with an actual offer
		preserve
			keep if offer == 1
			keep mrmatch programtype offer
			duplicates drop
			ren programtype offered_program_type
			tempfile actual_progs
			save "`actual_progs'"
		restore

		merge m:1 mrmatch using "`actual_progs'", gen(actual_prog_match)

		* This is the output we use to compute the pscores
		ren offer orig_student_offer
		sa "${cleandata}match_file_for_pscore_with_sim_lotteries_ms_`year'_new_`g'.dta", replace
}

}

}
